Accessories¶

In [301]:
import pandas as pd
accessories = pd.read_csv('accessories.csv') 
In [302]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
accessories = pd.read_csv('accessories.csv')

# Split the dataset
train_set_accessories, test_set_accessories = train_test_split(accessories, test_size=0.1, random_state=None)

# Display the number of rows in each set
print("Training set accessories size:", len(train_set_accessories))
print("Testing set accessories size:", len(test_set_accessories))
Training set accessories size: 5722
Testing set accessories size: 636
In [303]:
import matplotlib.pyplot as plt
import seaborn as sns
# Set the style for the plots
sns.set(style="whitegrid")

Accessories and Random 10% Top and Bottom¶

In [304]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_accessories, order=test_set_accessories['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 accessories Category Products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_accessories['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total accessories       ',total_subcategories)
subcategory
Chapeaux & Bonnets        106
Beanie Hat                 78
Baseball Caps              60
Foulards & Écharpes        56
Bucket Hat                 44
Flat Caps                  33
Chaussettes                30
Chaussettes & Collants     26
Gants                      26
Ceintures & Bretelles      20
Name: count, dtype: int64
Total accessories        636
In [305]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_accessories['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_accessories['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('accessories Bottom 10 Product Categories ')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Coques de téléphone      6
Tech Accessories         3
Boutons de manchettes    3
Sun Protection Sleeve    3
Ipad Case                2
Autres                   2
Voiles                   1
Favoris de mariage       1
Berets                   1
Fleurs                   1
Name: count, dtype: int64
In [306]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_accessories.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Subcategories with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Beanie Hat                11954
Foulards & Écharpes        8443
Chapeaux & Bonnets         6340
Chaussettes & Collants     4795
Chaussettes                4237
Baseball Caps              3328
Bucket Hat                 3293
Gants                      2657
Flat Caps                  2289
Ceintures & Bretelles      2264
Name: likes_count, dtype: int64
In [307]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_accessories.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Subcategories with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Sun Protection Sleeve        10
Berets                       12
Voiles                       24
Ipad Case                    40
Fleurs                       48
Favoris de mariage          128
Tech Accessories            220
Cache-oreilles & Masques    300
Headband & Bandeaux         310
Autres                      365
Name: likes_count, dtype: int64
In [308]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_accessories['current_price'], kde=True, bins=15, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [309]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 8))
sns.scatterplot(x='discount', y='likes_count', data=test_set_accessories, color='red')
plt.title(' accessories Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [310]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_accessories[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [311]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_accessories = cap_outliers(test_set_accessories, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_accessories[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [312]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_accessories[var] = label_encoder.fit_transform(test_set_accessories[var])

test_set_accessories.head()
Out[312]:
category subcategory name current_price raw_price discount likes_count
640 accessories 8 238 24.5375 48.86 52 204.0
2700 accessories 20 118 8.1700 14.86 45 99.0
5430 accessories 28 417 10.1300 23.87 58 52.0
6156 accessories 2 152 16.4200 37.96 57 25.0
3562 accessories 9 151 11.5800 20.61 44 61.0
In [313]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_accessories['discount_bin'] = pd.cut(test_set_accessories['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
test_set_discount_likes = test_set_accessories.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for accessories')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [314]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_accessories, color='blue', alpha=.8)
plt.title('Relationship Between Price and Likes Count for accessories ')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [315]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_accessories['current_price']
likes_count = test_set_accessories['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [316]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_accessories, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for accessories ')
plt.xlabel('Discount')
plt.ylabel('Likes Count')
plt.show()
In [317]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_accessories['discount']
likes_count = test_set_accessories['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [318]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_accessories[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_accessories['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_accessories.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_accessories[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust n_clusters as needed
test_set_accessories['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_accessories.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
  warnings.warn(
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
  warnings.warn(

Bags¶

In [319]:
bags = pd.read_csv('bags.csv')
In [320]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
bags = pd.read_csv('bags.csv')

# Split the dataset
train_set_bags, test_set_bags = train_test_split(bags, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set bags size:", len(train_set_bags))
print("Testing set bags size:", len(test_set_bags))
Training set bags size: 5641
Testing set bags size: 627

Bags and Random 10%¶

In [321]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_bags, order=test_set_bags['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10  Bags')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_bags['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()


# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total bags                     ',total_subcategories)
subcategory
Sac bandoulière                 244
Portefeuilles                   141
Sacs à main                      84
Sacs à dos                       78
Sacs chic                        28
Sacs de voyage                   11
Cosmetic Bags                    10
Étui & Sac des monnaies           7
Men's Bags                        7
Sacs de rangement & Trousses      6
Name: count, dtype: int64
Total bags                      627
In [322]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_bags['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_bags['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('Bags Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Sacs de voyage                  11
Cosmetic Bags                   10
Étui & Sac des monnaies          7
Men's Bags                       7
Sacs de rangement & Trousses     6
Pochettes                        5
Porte-documents                  3
Pochettes & Clutches             1
Sacs cosmétiques                 1
Bag Accessories                  1
Name: count, dtype: int64
In [323]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_bags.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Bags with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Pochettes & Clutches               8
Bag Accessories                  179
Porte-documents                  323
Men's Bags                       450
Sacs de rangement & Trousses    1218
Pochettes                       1361
Étui & Sac des monnaies         1751
Sacs chic                       2597
Cosmetic Bags                   3743
Sacs de voyage                  7116
Name: likes_count, dtype: int64
In [324]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_bags.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Bags with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Sac bandoulière            56711
Sacs à main                31328
Portefeuilles              20123
Sacs à dos                 16607
Sacs cosmétiques            7266
Sacs de voyage              7116
Cosmetic Bags               3743
Sacs chic                   2597
Étui & Sac des monnaies     1751
Pochettes                   1361
Name: likes_count, dtype: int64
In [177]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_bags['current_price'], kde=True, bins=20, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [178]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_bags, color='red')
plt.title('bags Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [179]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_bags['discount']
likes_count = test_set_bags['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [180]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_bags[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [181]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_bags = cap_outliers(test_set_bags, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_bags[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [182]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_bags[var] = label_encoder.fit_transform(test_set_bags[var])

test_set_bags.head()
Out[182]:
category subcategory name current_price raw_price discount likes_count
3158 bags 7 147 34.99 75.00 53.0 2.0
1345 bags 14 215 8.99 18.00 50.0 6.0
6110 bags 7 115 7.45 21.83 66.0 195.0
5532 bags 7 496 22.39 108.99 78.5 106.0
2521 bags 6 234 9.80 20.57 52.0 165.0
In [183]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_bags['discount_bin'] = pd.cut(test_set_bags['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_bags.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for bags')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [184]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_bags, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for bags')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [185]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_bags['current_price']
likes_count = test_set_bags['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [186]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_bags[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
test_set_bags['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_bags.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_bags[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust n_clusters as needed
test_set_bags['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_bags.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
  warnings.warn(
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.
  warnings.warn(

Beauty¶

In [325]:
beauty = pd.read_csv('beauty.csv')
In [326]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
beauty = pd.read_csv('beauty.csv')

# Split the dataset
train_set_beauty, test_set_beauty = train_test_split(beauty, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set beauty size:", len(train_set_beauty))
print("Testing set beauty size:", len(test_set_beauty))
Training set beauty size: 3423
Testing set beauty size: 381

Beauty and Random 10%¶

In [327]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_beauty, order=test_set_beauty['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 Beauty')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_beauty['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total beauty             ',total_subcategories)
subcategory
Soin visage                19
Fard à paupières           13
Perruques synthétiques     13
Vêtements minceur          12
Trousses                   11
Vernis à ongles            11
Soins des pieds            11
Autres outils              10
Sports Equipments          10
Accessoires soin visage    10
Name: count, dtype: int64
Total beauty              381
In [328]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_beauty['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_beauty['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title(' Beauty Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Stylo rouge à lèvres                    1
Démaquillage                            1
Soins du visage                         1
Pochoirs pour ongles                    1
Oils                                    1
Polissoir pour ongles                   1
Colle à ongles                          1
Sèche-cheveux                           1
Déodorants & Anhidrotiques              1
Organisateur cosmétique en acrylique    1
Name: count, dtype: int64
In [329]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_beauty.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Beauty with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Polissoir pour ongles               10
Spa & Aromathérapie & Diffuseurs    33
Pochoirs pour ongles                37
Sèche-cheveux                       59
Démaquillage                        60
Daily Necessities                   77
Lisseurs des cheveux                79
Déodorants & Anhidrotiques          80
Soins des mains                     85
Ciseaux & Cisailles de coiffure     91
Name: likes_count, dtype: int64
In [330]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_beauty.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Beauty with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Gloss à lèvres                           8936
Vêtements minceur                        3670
Correcteur                               2957
Soin visage                              2932
Faux cils                                2667
Soins des pieds                          2508
Rouge à lèvres                           2272
Traitement des Cheveux & Cuir chevelu    2206
Trousses                                 1965
Vernis à ongles                          1655
Name: likes_count, dtype: int64
In [331]:
# Plot: Distribution of current prices
plt.figure(figsize=(12, 8))
sns.histplot(test_set_beauty['current_price'], kde=True, bins=75, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [194]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_beauty, color='red')
plt.title('beauty Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [195]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_beauty['discount']
likes_count = test_set_beauty['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [196]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_beauty[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [197]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_beauty = cap_outliers(test_set_beauty, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_beauty[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [198]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_beauty[var] = label_encoder.fit_transform(test_set_beauty[var])

test_set_beauty.head()
Out[198]:
category subcategory name current_price raw_price discount likes_count
2014 beauty 21 274 51.19 89.990 43.0 42.0
527 beauty 40 41 30.46 52.730 42.0 391.5
478 beauty 42 297 51.19 103.965 50.0 32.0
1684 beauty 6 177 9.19 18.660 51.0 33.0
1653 beauty 53 55 15.12 30.450 50.0 310.0
In [199]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_beauty['discount_bin'] = pd.cut(test_set_beauty['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_beauty.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for beauty')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [200]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_beauty, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for beauty')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [201]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_beauty['current_price']
likes_count = test_set_beauty['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [202]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_beauty[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_beauty['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_beauty.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_beauty[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_beauty['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_beauty.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(

House¶

In [332]:
house = pd.read_csv('house.csv')
In [333]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
house = pd.read_csv('house.csv')

# Split the dataset
train_set_house, test_set_house = train_test_split(house, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set house size:", len(train_set_house))
print("Testing set house size:", len(test_set_house))
Training set house size: 11511
Testing set house size: 1280

House and Random 10%¶

In [335]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_house, order=test_set_house['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 Household items')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_house['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total house               ',total_subcategories)
subcategory
Housses de coussin               66
Jouets Squishy                   52
Sacs de voyage & shopping        29
Sacs d'organisation de maison    26
Literie                          25
Coussins & Oreillers             22
Autocollants de murs             21
Boîte de stockage                21
Sacs de ligne & cosmétique       20
Flowers                          19
Name: count, dtype: int64
Total house                1280
In [206]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_house['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_house['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('Household Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Tongs                        1
Sécurité de salle de bain    1
Fournitures d'envoi          1
Pailles                      1
Alarme Smart                 1
Maisons & Cages              1
Terrariums                   1
Tool Sets                    1
Lampes de toilette           1
Parapluies & Parasol         1
Name: count, dtype: int64
In [336]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_house.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Household with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Flowers                          12510
Jouets Squishy                   10215
Sacs d'organisation de maison     8753
Couture                           7294
Housses de coussin                7025
Sacs de voyage & shopping         5551
Literie                           4613
Coussins & Oreillers              4264
Fruits                            4194
Home Carpets                      3773
Name: likes_count, dtype: int64
In [337]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_house.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Household with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Lighting Accessories               2
Ventilateur                        7
Vêtements de pluie & Parapluie     8
Arts & Artisanat & Couture         8
Pailles                            9
Décoration de soirée               9
Ballons & Accessoires              9
Pots & Planters                   11
Terrariums                        12
Sacs & Boîtes de bonbons          13
Name: likes_count, dtype: int64
In [338]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_house['current_price'], kde=True, bins=200, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Current Price')
plt.ylabel('Frequency')
plt.show()
In [210]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_house, color='red')
plt.title('house Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [211]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_house['discount']
likes_count = test_set_house['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [212]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_house[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [213]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_house = cap_outliers(test_set_house, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_house[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [214]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_house[var] = label_encoder.fit_transform(test_set_house[var])

test_set_house.head()
Out[214]:
category subcategory name current_price raw_price discount likes_count
4091 house 174 544 5.76 20.59 72 27.0
3921 house 204 109 3.29 7.11 54 211.0
1355 house 8 490 24.99 43.99 43 31.0
6109 house 21 658 26.80 60.92 56 61.0
4132 house 135 84 4.79 9.99 52 57.0
In [215]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_house['discount_bin'] = pd.cut(test_set_house['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_house.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for house')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [216]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_house, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for house')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [217]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_house['current_price']
likes_count = test_set_house['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [218]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_house[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_house['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_house.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_house[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_house['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_house.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()

Jewelry¶

In [341]:
jewelry = pd.read_csv('jewelry.csv')
In [342]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
jewelry = pd.read_csv('jewelry.csv')

# Split the dataset
train_set_jewelry, test_set_jewelry = train_test_split(jewelry, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set jewelry size:", len(train_set_jewelry))
print("Testing set jewelry size:", len(test_set_jewelry))
Training set jewelry size: 4367
Testing set jewelry size: 486

Jewelry and Random 10%¶

In [343]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_jewelry, order=test_set_jewelry['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 Jewelry ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_jewelry['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total jewelry             ',total_subcategories)
subcategory
Boucles d'oreilles         112
Colliers                    78
Bracelets                   52
Bagues                      50
Montres pour homme          35
Montres pour femme          27
Accessoires des cheveux     16
Sets de bijoux              16
Montres connectées          13
Bracelets pour homme        12
Name: count, dtype: int64
Total jewelry              486
In [344]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_jewelry['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_jewelry['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('Jewelery Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Fine Copper2                     2
Boucles d'oreilles pour homme    2
Coffrets & Sacs de bijoux        2
Colliers de couple               2
Présentoirs de bijoux            2
Accessoires                      2
Mascarade & Cosplay              2
Montres couple                   1
Montres de poche                 1
Bagues de couple                 1
Name: count, dtype: int64
In [345]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_jewelry.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Jewelry with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Mascarade & Cosplay               72
Accessoires                       85
Montres de poche                  87
Montres couple                   121
Bijoux de corps                  193
Présentoirs de bijoux            226
Boucles d'oreilles pour homme    280
Coffrets & Sacs de bijoux        289
Colliers de couple               346
Broches et épingles              358
Name: likes_count, dtype: int64
In [346]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_jewelry.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Jewelry with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Boucles d'oreilles         20337
Colliers                   18895
Bagues                     11507
Bracelets                   7153
Montres pour homme          3786
Bagues pour homme           3319
Montres pour femme          2991
Accessoires des cheveux     2583
Colliers pour homme         2184
Bracelets de cheville       1758
Name: likes_count, dtype: int64
In [347]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_jewelry['current_price'], kde=True, bins=100, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [226]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_jewelry, color='red')
plt.title('jewelry Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [227]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_jewelry['discount']
likes_count = test_set_jewelry['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [228]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_jewelry[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [229]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_jewelry = cap_outliers(test_set_jewelry, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_jewelry[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [230]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_jewelry[var] = label_encoder.fit_transform(test_set_jewelry[var])

test_set_jewelry.head()
Out[230]:
category subcategory name current_price raw_price discount likes_count
3209 jewelry 2 317 10.66 25.39000 58 58.0
1684 jewelry 12 481 9.75 19.82000 51 185.0
1044 jewelry 9 206 11.49 18.74000 39 35.0
4813 jewelry 24 417 25.08 47.98625 49 118.0
1538 jewelry 23 437 25.49 47.98625 50 64.0
In [231]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_jewelry['discount_bin'] = pd.cut(test_set_jewelry['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_jewelry.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for beauty')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [232]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_jewelry, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for jewelry')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [233]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_jewelry['current_price']
likes_count = test_set_jewelry['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [234]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_jewelry[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_jewelry['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_jewelry.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_jewelry[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_jewelry['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_jewelry.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(

Kids¶

In [358]:
kids = pd.read_csv('kids.csv')
In [359]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
kids = pd.read_csv('kids.csv')

# Split the dataset
train_set_kids, test_set_kids = train_test_split(kids, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set kids size:", len(train_set_kids))
print("Testing set kids size:", len(test_set_kids))
Training set kids size: 3676
Testing set kids size: 409

Kids and Random 10%¶

In [360]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_kids, order=test_set_kids['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 Kids category products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_kids['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total kids                  ',total_subcategories)
subcategory
Robes                        86
Brassières de grossesse      58
Costume & Jupe-culotte       34
Salopettes & Combinaisons    29
Costumes pour bébé           21
Chaussures pour fille        15
Coutumes & Co-ords           14
Tops                         14
Tops & Tees                  13
Blousons & Vestes            12
Name: count, dtype: int64
Total kids                   409
In [361]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_kids['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_kids['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('Kids Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Sacs à dos                2
Pantalons & Capris        2
Imperméables              1
Colliers & Pendentifs     1
Sandals                   1
Chaussures pour enfant    1
Slippers                  1
Trousses                  1
Chaussettes               1
Écharpes                  1
Name: count, dtype: int64
In [364]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_kids.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Kid with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Pulls & Sweat-shirts       0
Slippers                   2
Trainers                   4
Sandals                    6
Flats & Loafers            9
Sneakers                  17
Écharpes                  22
Colliers & Pendentifs     23
Pantalons & Capris        44
Chaussures pour enfant    50
Name: likes_count, dtype: int64
In [365]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_kids.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Kid with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Robes                        15109
Brassières de grossesse       7471
Costume & Jupe-culotte        4186
Salopettes & Combinaisons     2635
Chaussures pour fille         1994
Coutumes & Co-ords            1559
Pantalons & Jupes             1233
Costumes pour bébé            1222
Chaussures pour garçon        1168
Trousses                      1051
Name: likes_count, dtype: int64
In [356]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_kids['current_price'], kde=True, bins=30, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [242]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_kids, color='red')
plt.title('kids Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [243]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_kids['discount']
likes_count = test_set_kids['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [244]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_kids[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [245]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_kids = cap_outliers(test_set_kids, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_kids[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [246]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_kids[var] = label_encoder.fit_transform(test_set_kids[var])

test_set_kids.head()
Out[246]:
category subcategory name current_price raw_price discount likes_count
599 kids 24 238 34.580 87.14 60.0 192
752 kids 18 143 19.050 47.99 60.0 43
2016 kids 15 20 22.060 55.57 60.0 60
1001 kids 24 249 38.745 69.99 42.5 176
2514 kids 10 73 22.530 57.58 61.0 146
In [247]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_kids['discount_bin'] = pd.cut(test_set_kids['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_kids.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for kids')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [248]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_kids, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for kids')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [249]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_kids['current_price']
likes_count = test_set_kids['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [250]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_kids[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_kids['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_kids.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_kids[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_kids['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_kids.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2.
  warnings.warn(

Shoes¶

In [366]:
shoes = pd.read_csv('shoes.csv')
In [367]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
shoes = pd.read_csv('shoes.csv')

# Split the dataset
train_set_shoes, test_set_shoes = train_test_split(shoes, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set shoes size:", len(train_set_shoes))
print("Testing set shoes size:", len(test_set_shoes))
Training set shoes size: 10640
Testing set shoes size: 1183

Shoes and Random 10%¶

In [368]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_shoes, order=test_set_shoes['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 Shoes ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_shoes['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()
# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total Shoes                    ',total_subcategories)
subcategory
Bottes & Bottines                181
Mocassins                        177
Derbies & Mocassins              167
Baskets                          136
Sandales & Mules                 118
Sneakers & Baskets                98
Sandales                          70
Bottes & Chaussures montantes     66
Chaussures de ville               45
Escarpins                         36
Name: count, dtype: int64
Total Shoes                     1183
In [369]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_shoes['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_shoes['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('Shoes Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Escarpins                 36
Claquettes & Tongs        35
Chaussons                 20
Slipper                   14
Plateforme                 7
Chaussures de sport        4
Sandals                    3
Pumps                      2
ACCESSOIRES CHAUSSURES     2
Flat & Loafers             2
Name: count, dtype: int64
In [371]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_shoes.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Shoes with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Bottes & Bottines                77694
Derbies & Mocassins              76947
Sandales & Mules                 46361
Mocassins                        26790
Escarpins                        23465
Sneakers & Baskets               20580
Baskets                          13067
Chaussures de ville              12249
Bottes & Chaussures montantes    11233
Sandales                          9621
Name: likes_count, dtype: int64
In [372]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_shoes.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Shoes with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Flat & Loafers              99
Pumps                      187
Sandals                    222
Chaussures de sport        497
Slipper                    504
ACCESSOIRES CHAUSSURES     649
Plateforme                1062
Chaussons                 1285
Claquettes & Tongs        6003
Sandales                  9621
Name: likes_count, dtype: int64
In [257]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_shoes['current_price'], kde=True, bins=30, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [258]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_shoes, color='red')
plt.title('shoes Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [259]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_shoes['discount']
likes_count = test_set_shoes['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [260]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_shoes[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [261]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_shoes = cap_outliers(test_set_shoes, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_shoes[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [262]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_shoes[var] = label_encoder.fit_transform(test_set_shoes[var])

test_set_shoes.head()
Out[262]:
category subcategory name current_price raw_price discount likes_count
8169 shoes 15 1056 31.94 64.64 51 87
900 shoes 8 297 62.03 107.70 42 93
8075 shoes 14 1006 56.69 80.08 33 423
7625 shoes 14 1025 38.69 93.59 59 31
2816 shoes 1 1103 25.19 69.22 64 14
In [263]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_shoes['discount_bin'] = pd.cut(test_set_shoes['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_shoes.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for shoes')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [264]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_shoes, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for shoes')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [265]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_shoes['current_price']
likes_count = test_set_shoes['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [266]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_shoes[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_shoes['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_shoes.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_shoes[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_shoes['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_shoes.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()

Women¶

In [373]:
women = pd.read_csv('women.csv')
In [374]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
women = pd.read_csv('women.csv')

# Split the dataset
train_set_women, test_set_women = train_test_split(women, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set women size:", len(train_set_women))
print("Testing set women size:", len(test_set_women))
Training set women size: 13328
Testing set women size: 1481

Women and Random 10%¶

In [375]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_women, order=test_set_women['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 Women Catergory Products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_women['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total women          ',total_subcategories)
subcategory
Chemises               164
Blouses & Chemises     162
Robes imprimées        131
T-shirts               128
Soutiens-gorge         104
Pantalons & Shorts      85
Robes décontractées     84
Culotte haute           64
Vestes & Gilets         63
Pulls & Cardigans       53
Name: count, dtype: int64
Total women           1481
In [376]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_women['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_women['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('Women Bottom 10 Product Categories ')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Survêtements        2
Bas                 2
Shorts              1
Blazers             1
Combinaison         1
Onesies             1
MANTEAUX & PULLS    1
Tops                1
Shortys             1
Sweats              1
Name: count, dtype: int64
In [377]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_women.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Women with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Blouses & Chemises              46617
Soutiens-gorge                  46451
Chemises                        40862
Robes imprimées                 23102
Pantalons & Shorts              21939
T-shirts                        20485
Robes décontractées             20397
Vestes & Gilets                 15063
Robes vintage                   14415
Combinaisons & Grenouillères    10702
Name: likes_count, dtype: int64
In [378]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_women.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Women with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Tops                  15
Shorts                23
Blazers               33
MANTEAUX & PULLS      68
Onesies               73
Shortys               89
Combinaison          127
Survêtements         138
Robes en dentelle    139
Boho Dresses         162
Name: likes_count, dtype: int64
In [273]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_women['current_price'], kde=True, bins=30, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [274]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_women, color='red')
plt.title('women Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [275]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_women['discount']
likes_count = test_set_women['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [276]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_women[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [277]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_women = cap_outliers(test_set_women, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_women[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [278]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_women[var] = label_encoder.fit_transform(test_set_women[var])

test_set_women.head()
Out[278]:
category subcategory name current_price raw_price discount likes_count
12853 women 47 1410 29.64 57.990 49 429.0
11804 women 38 7 21.10 40.910 48 125.0
2662 women 1 13 14.29 29.230 51 28.0
11887 women 15 576 45.49 95.715 50 31.0
14799 women 49 1455 19.99 39.990 50 15.0
In [279]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_women['discount_bin'] = pd.cut(test_set_women['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_women.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for women')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [280]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_women, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for women')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [281]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_women['current_price']
likes_count = test_set_women['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [282]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_women[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_women['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_women.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_women[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_women['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_women.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()

Men¶

In [379]:
men = pd.read_csv('men.csv')
In [380]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
men = pd.read_csv('men.csv')

# Split the dataset
train_set_men, test_set_men = train_test_split(men, test_size=0.1, random_state=42)

# Display the number of rows in each set
print("Training set men size:", len(train_set_men))
print("Testing set men size:", len(test_set_men))
Training set men size: 9187
Testing set men size: 1021

Men and Random 10%¶

In [285]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of product categories
plt.figure(figsize=(10, 6))
sns.countplot(y='subcategory', data=test_set_men, order=test_set_men['subcategory'].value_counts().index[:10], palette='viridis')
plt.title('Top 10 Men Category Products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Get the counts of each subcategory
subcategory_counts = test_set_men['subcategory'].value_counts()

# Get the Top 10 subcategories
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Display the Top 10 subcategories and their counts
print(top_10_subcategories)
print('Total men            ',total_subcategories)
subcategory
Shirts            274
T-Shirts          162
Boxers             91
Vestes             50
Pantalons          45
Hoodies            42
Shorts de bain     41
Pyjama             40
Henley Shirts      38
Slips              32
Name: count, dtype: int64
Total men             1021
In [286]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Assuming 'data_cleaned' is your DataFrame

# Get the counts of each subcategory
subcategory_counts = test_set_men['subcategory'].value_counts()

# Get the bottom 10 subcategories
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: Distribution of bottom 10 product categories
plt.figure(figsize=(10, 6))
sns.countplot(y=test_set_men['subcategory'],
              order=bottom_10_subcategories.index,
              palette='viridis')
plt.title('Men Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory
Doudounes & Parkas    5
Tanks                 4
Jumpsuits             3
Robes                 2
Waistcoats            2
Débardeurs            2
VESTES & MANTEAUX     1
PULLS & GILETS        1
Trousers              1
Onesies               1
Name: count, dtype: int64
In [381]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_men.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Men with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Robes                 80
Tanks                105
SOUS-VÊTEMENTS       258
PULLS & GILETS       259
Onesies              274
Trousers             320
Waistcoats           338
Bottoms              377
VESTES & MANTEAUX    485
Débardeurs           645
Name: likes_count, dtype: int64
In [382]:
# Group by subcategory and calculate the total likes for each subcategory
subcategory_likes = test_set_men.groupby('subcategory')['likes_count'].sum().sort_values(ascending=False)

# Get the top 10 subcategories with the highest likes count
top_10_subcategory_likes = subcategory_likes[:10]

# Create a bar plot for the top 10 subcategories vs likes count
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
plt.title('Top 10 Men with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Display the top 10 subcategories and their likes count
# Changed from 'print(top_10_subcategories)' to 'print(top_10_subcategory_likes)'
print(top_10_subcategory_likes)
subcategory
Shirts           51149
T-Shirts         20691
Pantalons        17314
Henley Shirts    14051
Hoodies          10073
Vestes            9498
Boxers            6241
Shorts            4464
Slips             3932
Pyjama            3900
Name: likes_count, dtype: int64
In [289]:
# Plot: Distribution of current prices
plt.figure(figsize=(10, 6))
sns.histplot(test_set_men['current_price'], kde=True, bins=30, color='blue')
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
In [290]:
# Plot: Scatter plot of discount vs. likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='discount', y='likes_count', data=test_set_men, color='red')
plt.title('men Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
In [291]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract discount and likes_count columns
discount = test_set_men['discount']
likes_count = test_set_men['likes_count']

# Calculate the midpoints for discount and likes_count
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of discount

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_discount + (mid_discount * 0.05), mid_likes + (mid_likes * 0.05), '   Q1', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes + (mid_likes * 0.05), '    Q2', fontsize=14, color='red')
plt.text(mid_discount - (mid_discount * 0.4), mid_likes - (mid_likes * 0.4), '     Q3', fontsize=14, color='red')
plt.text(mid_discount + (mid_discount * 0.05), mid_likes - (mid_likes * 0.4), '    Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for discount and likes_count
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_discount + (mean_discount * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_discount - (mean_discount * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_discount + (mean_discount * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [292]:
# Identify continuous variables
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# Create box plots for each continuous variable
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_men[var])
    plt.title(f'Box Plot of {var}')

plt.tight_layout()
plt.show()
In [293]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Define a function to cap outliers using min and max values
def cap_outliers(df, column):
  Q1 = df[column].quantile(0.25)
  Q3 = df[column].quantile(0.75)
  IQR = Q3 - Q1
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR
  df[column] = df[column].clip(lower_bound, upper_bound)
  return df

# Apply the function to the columns with outliers
for column in ['current_price', 'raw_price', 'discount', 'likes_count']:
  test_set_men = cap_outliers(test_set_men, column)


# Create box plots for each continuous variable after outlier treatment
plt.figure(figsize=(15, 10))
for i, var in enumerate(['current_price', 'raw_price', 'discount', 'likes_count']):
  plt.subplot(2, 2, i + 1)
  sns.boxplot(y=test_set_men[var])
  plt.title(f'Box Plot of {var} (Outliers Treated)')

plt.tight_layout()
plt.show()
In [294]:
import sklearn
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Identify non-continuous (categorical) variables
categorical_vars = ['subcategory','name']

# Apply label encoding to each categorical variable
for var in categorical_vars:
  test_set_men[var] = label_encoder.fit_transform(test_set_men[var])

test_set_men.head()
Out[294]:
category subcategory name current_price raw_price discount likes_count
5062 men 21 276 30.79 45.260 32 361.5
2405 men 22 464 31.19 64.220 51 92.0
9478 men 6 988 50.74 81.565 36 42.0
8713 men 29 917 17.99 35.990 50 11.0
9085 men 21 114 40.69 53.990 25 361.5
In [295]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Create discount bins
test_set_men['discount_bin'] = pd.cut(test_set_men['discount'], bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                                      labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'])

# Calculate mean likes per discount bin
discount_likes = test_set_men.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for men')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
In [296]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data_cleaned' is your DataFrame

# Plotting the relationship between price and likes count
plt.figure(figsize=(10, 6))
sns.scatterplot(x='current_price', y='likes_count', data=test_set_men, color='blue', alpha=0.6)
plt.title('Relationship Between Price and Likes Count for men')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
In [297]:
import pandas as pd
import matplotlib.pyplot as plt


# Extract current_price and likes_count columns
current_price = test_set_men['current_price']
likes_count = test_set_men['likes_count']

# Calculate the midpoints for current_price and likes_count
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)

# Add lines at the midpoints to divide the quadrants
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at the midpoint of likes
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')  # Vertical line at the midpoint of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mid_price + (mid_price * 0.05), mid_likes + (mid_likes * 0.05), 'Q1', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes + (mid_likes * 0.05), 'Q2', fontsize=14, color='red')
plt.text(mid_price - (mid_price * 0.4), mid_likes - (mid_likes * 0.4), 'Q3', fontsize=14, color='red')
plt.text(mid_price + (mid_price * 0.05), mid_likes - (mid_likes * 0.4), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()

# Calculate the mean for current_price and likes_count
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Create the scatter plot
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)

# Add lines at the mean values to divide the quadrants
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')  # Horizontal line at mean of likes
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')  # Vertical line at mean of price

# Add labels and title
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Display the quadrants labels
plt.text(mean_price + (mean_price * 0.1), mean_likes + (mean_likes * 0.1), 'Q1', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes + (mean_likes * 0.1), 'Q2', fontsize=14, color='red')
plt.text(mean_price - (mean_price * 0.5), mean_likes - (mean_likes * 0.5), 'Q3', fontsize=14, color='red')
plt.text(mean_price + (mean_price * 0.1), mean_likes - (mean_likes * 0.5), 'Q4', fontsize=14, color='red')

# Show the plot
plt.show()
In [298]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_men[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_men['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_men.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Assuming 'data_cleaned' is your DataFrame

# Select features for clustering (price and discount)
X = test_set_men[['current_price', 'discount']]

# Perform K-Means clustering
kmeans = KMeans(n_clusters=4, random_state=42)  # Adjust n_clusters as needed
test_set_men['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_men.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
In [299]:
correlations = {
    'Category': ['Accessories', 'Bags', 'Beauty', 'House', 'Jewelry', 'Kids', 'Shoes', 'Women', 'Men'],
    'Correlation with Likes Count': [
        test_set_accessories['discount'].corr(test_set_accessories['likes_count']),
        test_set_bags['discount'].corr(test_set_bags['likes_count']),
        test_set_beauty['discount'].corr(test_set_beauty['likes_count']),
        test_set_house['discount'].corr(test_set_house['likes_count']),
        test_set_jewelry['discount'].corr(test_set_jewelry['likes_count']),
        test_set_kids['discount'].corr(test_set_kids['likes_count']),
        test_set_shoes['discount'].corr(test_set_shoes['likes_count']),
        test_set_women['discount'].corr(test_set_women['likes_count']),
        test_set_men['discount'].corr(test_set_men['likes_count'])
    ]
}

# Create a DataFrame and round the correlation values to 2 decimal places
correlation_df = pd.DataFrame(correlations)
correlation_df['Correlation with Likes Count'] = correlation_df['Correlation with Likes Count'].round(2)

# Display the DataFrame as a table
print(correlation_df)
      Category  Correlation with Likes Count
0  Accessories                         -0.09
1         Bags                         -0.08
2       Beauty                         -0.14
3        House                         -0.18
4      Jewelry                         -0.21
5         Kids                          0.07
6        Shoes                         -0.02
7        Women                          0.07
8          Men                         -0.22
In [300]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Dictionary of datasets
datasets = {
    'Accessories': test_set_accessories,
    'Bags': test_set_bags,
    'Beauty': test_set_beauty,
    'House': test_set_house,
    'Jewelry': test_set_jewelry,
    'Kids': test_set_kids,
    'Shoes': test_set_shoes,
    'Women': test_set_women,
    'Men': test_set_men
}

# Loop through each dataset to train and evaluate a Random Forest model
for dataset_name, dataset in datasets.items():
    # Drop rows with missing values in the columns of interest
    dataset = dataset[['discount', 'likes_count']].dropna()

    # Check if there are enough data points
    if len(dataset) < 10:
        print(f"{dataset_name}: Not enough data for training.")
        continue

    # Split the data into features and target
    X = dataset[['discount']]  # Feature
    y = dataset['likes_count']  # Target

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Random Forest model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate and print performance metrics
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{dataset_name} - Mean Squared Error: {mse:.2f}, R^2 Score: {r2:.2f}")
Accessories - Mean Squared Error: 2740.65, R^2 Score: -0.08
Bags - Mean Squared Error: 14041.80, R^2 Score: -0.11
Beauty - Mean Squared Error: 17711.76, R^2 Score: -0.02
House - Mean Squared Error: 13511.00, R^2 Score: 0.01
Jewelry - Mean Squared Error: 15059.38, R^2 Score: 0.12
Kids - Mean Squared Error: 4806.38, R^2 Score: 0.04
Shoes - Mean Squared Error: 24012.94, R^2 Score: 0.03
Women - Mean Squared Error: 24338.87, R^2 Score: 0.04
Men - Mean Squared Error: 13952.36, R^2 Score: 0.00
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: